# print info on design, number of cells
cat(paste('number of subjects = ',30,'\n',
          '---','\n',
          'number of trials per subject = ',256,'\n', 
          '---','\n',          
          'number of unique cells in the design = Item(4) * Number(2) * Vagueness(2) * Order(2) * Quantity(2) = ',4*2*2*2*2,'\n',
          'number of times a subject saw a unique cell in the design = 256 / 64 = ', 256/64,'\n',
          '---','\n',
          'number of higher-level cells in the design (abstracting over Order and Quantity) = Item(4) * Number(2) * Vagueness(2) = ',4*2*2,'\n',
          'number of times a subject saw a higher level cell in the design =  = 256 / 16 = ', 4*2*2, '\n',
          sep=''))
## number of subjects = 30
## ---
## number of trials per subject = 256
## ---
## number of unique cells in the design = Item(4) * Number(2) * Vagueness(2) * Order(2) * Quantity(2) = 64
## number of times a subject saw a unique cell in the design = 256 / 64 = 4
## ---
## number of higher-level cells in the design (abstracting over Order and Quantity) = Item(4) * Number(2) * Vagueness(2) = 16
## number of times a subject saw a higher level cell in the design =  = 256 / 16 = 16

Distributional stuff, including transformations.

require(xtable, quietly=TRUE)
x=445
lambda = c(-2,-1,-0.5, 0,0.5,1,2)

xd=data.frame(
  lambda = c(
    lambda[1],
    lambda[2],
    lambda[3],
    lambda[4],
    lambda[5],
    lambda[6],
    lambda[7]
  ),
  x=c(    
    paste("x = ", sep="", x),
    paste("x = ", sep="", x),
    paste("x = ", sep="", x),
    paste("x = ", sep="", x),
    paste("x = ", sep="", x),
    paste("x = ", sep="", x),
    paste("x = ", sep="", x)
     ),
  conventional=c(
    '$$y=\\frac{1}{x^2}$$', 
    '$$y=\\frac{1}{x}$$', 
    '$$y=\\frac{1}{\\sqrt{x}}$$', 
    '$$y=log_e(x)$$', 
    '$$y=\\sqrt{x}$$', 
    '$$y=x$$', 
    '$$y=x^2$$'
    ),
#   y1=c(    
#     paste("y = ", sep="", 1/(x^2)),
#     paste("y = ", sep="", 1/x),
#     paste("y = ", sep="", 1/sqrt(x)),
#     paste("y = ", sep="", round(log(x),2)),
#     paste("y = ", sep="", sqrt(x)),
#     paste("y = ", sep="", x),
#     paste("y = ", sep="", x^2)
#   ),
  power=c(
    '$$y=x^{-2}$$', 
    '$$y=x^{-1}$$', 
    '$$y=x^{-0.5}$$', 
    '$$y=log_e(x)$$', 
    '$$y=x^{0.5}$$', 
    '$$y=x^{1}$$', 
    '$$y=x^{2}$$'
    ),
  y=c(     
    paste("y = ", sep="", format(round(x^lambda[1],6), sc=F) ),
    paste("y = ", sep="", round(x^lambda[2],3) ),
    paste("y = ", sep="", round(x^lambda[3],3) ),
    paste("y = ", sep="", round(log(x),3) ),
    paste("y = ", sep="", round(x^lambda[5],2) ),
    paste("y = ", sep="", round(x^lambda[6],6) ),
    paste("y = ", sep="", round(x^lambda[7],6) )
  ),
  back = c(
    '$$x=y^{\\frac{1}{-2}}$$',
    '$$x=y^{\\frac{1}{-1}}$$', 
    '$$x=y^{\\frac{1}{-0.5}}$$', 
    '$$x=exp(y)$$',  
    '$$x=y^{\\frac{1}{0.5}}$$', 
    '$$x=y^{\\frac{1}{1}}$$',  
    '$$x=y^{\\frac{1}{2}}$$'
  ),
  back_lambda=c(
    '$$x=y^{1/\\lambda}$$',
    '$$x=y^{1/\\lambda}$$',
    '$$x=y^{1/\\lambda}$$',
    '$$x=exp(y)$$',  
    '$$x=y^{1/\\lambda}$$',
    '$$x=y^{1/\\lambda}$$',
    '$$x=y^{1/\\lambda}$$'
  ),
  y_back=c(  
    paste( 'x = ', (x^lambda[1]) ^ {1/lambda[1]} ,sep=""),
    paste( 'x = ', (x^lambda[2]) ^ {1/lambda[2]} ,sep=""),
    paste( 'x = ', (x^lambda[3]) ^ {1/lambda[3]} ,sep=""),
    paste( 'x = ', exp(log(x)), sep=""),
    paste( 'x = ', (x^lambda[5]) ^ {1/lambda[5]} ,sep=""),
    paste( 'x = ', (x^lambda[6]) ^ {1/lambda[6]} ,sep=""),
    paste( 'x = ', (x^lambda[7]) ^ {1/lambda[7]} ,sep="")
  )
) 

print(xtable(xd, digits=1, align=c(rep('l', times=ncol(xd)+1))), type='html', include.rownames=F, digits=1)
lambda x conventional power y back back_lambda y_back
-2.0 x = 445 \[y=\frac{1}{x^2}\] \[y=x^{-2}\] y = 0.000005 \[x=y^{\frac{1}{-2}}\] \[x=y^{1/\lambda}\] x = 445
-1.0 x = 445 \[y=\frac{1}{x}\] \[y=x^{-1}\] y = 0.002 \[x=y^{\frac{1}{-1}}\] \[x=y^{1/\lambda}\] x = 445
-0.5 x = 445 \[y=\frac{1}{\sqrt{x}}\] \[y=x^{-0.5}\] y = 0.047 \[x=y^{\frac{1}{-0.5}}\] \[x=y^{1/\lambda}\] x = 445
0.0 x = 445 \[y=log_e(x)\] \[y=log_e(x)\] y = 6.098 \[x=exp(y)\] \[x=exp(y)\] x = 445
0.5 x = 445 \[y=\sqrt{x}\] \[y=x^{0.5}\] y = 21.1 \[x=y^{\frac{1}{0.5}}\] \[x=y^{1/\lambda}\] x = 445
1.0 x = 445 \[y=x\] \[y=x^{1}\] y = 445 \[x=y^{\frac{1}{1}}\] \[x=y^{1/\lambda}\] x = 445
2.0 x = 445 \[y=x^2\] \[y=x^{2}\] y = 198025 \[x=y^{\frac{1}{2}}\] \[x=y^{1/\lambda}\] x = 445
if(require(MASS)){}
if(require(ggplot2)){}
if(require(lmerTest)){}
if(require(xtable)){}
if(require(plyr)){}
if(require(reshape2)){}
if(require(lattice)){}
if(require(LMERConvenienceFunctions)){} # for perSubjectTrim
if(require(car)){}
if(require(rms)){} # for varclus
if(require(gridExtra)){}
if(require(grid)){}
if(require(latticeExtra)){}
if(require(beepr)){}

Get the data if file dat.Rda does not already exist.

# if the output file dat.Rda already exists then load it, else do data wrangling
if( file.exists('dat.Rda') ) load('dat.Rda') else {
  
  # source required data wrangling functions
  source('functions/classifyResponse.R')
  source('functions/gatherData.R')
  
  # declare local variables
  number_of_valid_subjects <- 30
  number_of_rows <- 7680 # 7680
  number_of_trials_per_subject <- number_of_rows / number_of_valid_subjects # 256
  
    ## call user-defined functions
  
  # collect all individual subject files into one data frame
  dat <- gatherData(number_of_valid_subjects)
  
  # classify the response as expected, near, or far
  dat <- classifyResponse(dat)
  
   ## treat variables
  
  # SUBJECT
  # ensure Subject is a factor 
  dat$Subject=factor(paste("s",sprintf("%02d",dat$Subject),sep=""))
  
  # TRIAL
  # Trial for subject, 1 to 256
  dat$Trial = rep(x = 1:number_of_trials_per_subject, times = number_of_valid_subjects)
  
  # make a centred Trial for modeling
  dat$c_Trl <-dat$Trial - mean(dat$Trial)
  
  # make a scaled Trial for modelling
  dat$s_Trl <- as.numeric(scale(dat$Trial))
  
  # ID
  # id is a unique identifier for the 7680 row data
  dat$id <- factor(paste(paste(dat$Subject), 
                         paste("t", sprintf("%03d", dat$Trial), sep="") , sep=":"))
  
  # ITEM
  # create a centred numeric item variable for modeling
  dat$c_Itm <- ifelse(dat$Item==1, -.75, 
                      ifelse(dat$Item==2, -.25,
                             ifelse(dat$Item==3, .25, .75)))
  
  # make Item be a factor and assign labels
  dat$Item <- factor(dat$Item, levels=c(1,2,3,4), labels=c("06:15:24", "16:25:34", "26:35:44", "36:45:54"))
  
  # add ratios for Item
  item_range_ratio = c(6 / 24, 16 / 34, 26 / 44, 36 / 54) 
          # 0.2500000 0.4705882 0.5909091 0.6666667
  item_range_ratio_scaled = as.vector(scale(c(6 / 24, 16 / 34, 26 / 44, 36 / 54)))
          # -1.3441995 -0.1316642  0.5297187  0.9461450
  item_mean_ratio = c(mean(c(6 / 15, 15 / 24)), mean(c(16 / 25, 25 / 34)), mean(c(26 /35, 35 / 44)), mean(c(36 / 45, 45 / 54))) 
          # 0.5125000 0.6876471 0.7691558 0.8166667
  item_mean_ratio_scaled = as.vector(scale(c(mean(c(6 / 15, 15 / 24)), mean(c(16 / 25, 25 / 34)), mean(c(26 /35, 35 / 44)), mean(c(36 / 45, 45 / 54))))) 
          # -1.37582241 -0.06614191  0.54334858 0.89861574
  
  dat[dat$Item == "06:15:24", "item_range_ratio"] <-  0.2500000
  dat[dat$Item == "16:25:34", "item_range_ratio"] <-  0.4705882
  dat[dat$Item == "26:35:44", "item_range_ratio"] <-  0.5909091
  dat[dat$Item == "36:45:54", "item_range_ratio"] <-  0.6666667
  
  dat[dat$Item == "06:15:24", "item_range_ratio_scaled"] <-  -1.3441995
  dat[dat$Item == "16:25:34", "item_range_ratio_scaled"] <-  -0.1316642
  dat[dat$Item == "26:35:44", "item_range_ratio_scaled"] <-   0.5297187  
  dat[dat$Item == "36:45:54", "item_range_ratio_scaled"] <-   0.9461450
  
  dat[dat$Item == "06:15:24", "item_mean_ratio"] <-  0.5125000 
  dat[dat$Item == "16:25:34", "item_mean_ratio"] <-  0.6876471 
  dat[dat$Item == "26:35:44", "item_mean_ratio"] <-  0.7691558
  dat[dat$Item == "36:45:54", "item_mean_ratio"] <-  0.8166667
  
  dat[dat$Item == "06:15:24", "item_mean_ratio_scaled"] <-   -1.37582241 
  dat[dat$Item == "16:25:34", "item_mean_ratio_scaled"] <-   -0.06614191 
  dat[dat$Item == "26:35:44", "item_mean_ratio_scaled"] <-    0.54334858 
  dat[dat$Item == "36:45:54", "item_mean_ratio_scaled"] <-    0.89861574
  
  # VAGUENESS
  # Create a factor coding for Vagueness
  dat[ dat$Condition==1 , 'Vagueness'] <- 'Vague'
  dat[ dat$Condition==2 , 'Vagueness'] <- 'Crisp'
  dat[ dat$Condition==3 , 'Vagueness'] <- 'Vague'
  dat[ dat$Condition==4 , 'Vagueness'] <- 'Crisp'
  dat$Vagueness <- as.factor(dat$Vagueness)
  
  # manually center Vagueness
  dat$c_Vag <- ifelse(dat$Vagueness=='Crisp', -.5, .5)
  
  # NUMBER
  # Create a factor coding for Number use
  dat[ dat$Condition==1 , 'Number'] <- 'Numeric'
  dat[ dat$Condition==2 , 'Number'] <- 'Numeric'
  dat[ dat$Condition==3 , 'Number'] <- 'Verbal'
  dat[ dat$Condition==4 , 'Number'] <- 'Verbal'
  dat$Number <- as.factor(dat$Number)
  
  # manually center Number
  dat$c_Num <- ifelse(dat$Number=='Numeric', -.5, .5)
  
  # CONDITION
  # make a factor out of Condition, as f_Cnd
  dat$f_Cnd <- factor(dat$Condition, levels=c(1,2,3,4), labels=c('Vg:Nm', 'Cr:Nm', 'Vg:Vb', 'Cr:Vb'))
  
  # ORDER
  # give the levels of Order meaningful names
  dat$Order <- factor(dat$Order, levels=c(1,2), labels=c('LtoR','RtoL'))
  
  # make a manually centred Order
  dat$c_Ord <- ifelse(dat$Order=="LtoR",-.5,.5)
  
  # QUANTITY
  # give the levels of Quantity meaningful names
  dat$Quantity <- factor(dat$Quantity, levels=c(1,2), labels=c('Small','Large'))
  
  # make a manually centred Quantity
  dat$c_Qty <- ifelse(dat$Quantity=="Small",-.5,.5)
  
  # INSTRUCTION
  # add number of characters in the instruction # 29 30 34 36 38
  dat$nchar_instr = nchar(dat$Instruction)
  dat$nchar_instr_scaled = as.vector(scale(nchar(dat$Instruction), scale=TRUE))
  
  # make Instruction be a factor (17 levels)
  dat$Instruction <- as.factor(dat$Instruction) 
  
  # RT
  # add transformations of RT
  dat$RT_RecSqd    <- 1/dat$RT^2
  dat$RT_Rec       <- 1/dat$RT
  dat$RT_RecSqt    <- 1/sqrt(dat$RT) 
  dat$RT_log       <- log(dat$RT)
  dat$RT_sqt       <- sqrt(dat$RT)
  dat$RT_raw       <- dat$RT
  dat$RT_sqd       <- dat$RT^2 
  
  # print to file a table with information about the design
  design_info <- unique(subset(dat, select=c(Item, Condition, Vagueness, Number, Quantity, Order, 
                                             Left, Mid, Right, Exp_Num, Bline_Num, Extr_Num, Exp_side, Bline_side, Extr_side,Instruction)))
  design.info <- design_info[order(design_info$Item, design_info$Condition, design_info$Quantity, design_info$Order),]
  row.names(design_info) <- NULL
  capture.output(  print.data.frame(design_info, row.names=F, print.gap=3, quote=F, right=F), file="design_info.table")
  
  # put dat in better column order
  dat <- subset(dat, 
                select = c(id, Subject, Trial, Condition, Order, Quantity, Vagueness, Number, 
                           Item, item_range_ratio, item_range_ratio_scaled, item_mean_ratio, item_mean_ratio_scaled,
                           c_Trl, s_Trl, c_Itm, c_Vag, c_Num, f_Cnd, c_Ord, c_Qty,
                           RT, RT_RecSqd, RT_Rec, RT_RecSqt, RT_log, RT_sqt, RT_raw, RT_sqd,
                           Exp_Num, Bline_Num, Extr_Num, Exp_side, Bline_side, Extr_side, response_num, response_side, response_category, Left, Mid, Right,
                           Instruction, nchar_instr
                ))
  
  # save dat as dat.Rda
  # This data set contains *all* trials 7680 including impossible trials and is mainly for graphs comparing different removals
  save(dat, file="dat.Rda")
  
  ####################################################
  # remove impossible trials from dat in new data dd #
  ####################################################
  
  # Throw out RT = 1 and RT = 59998, and RTprev = 1 and RTprev = 59998 
  # i.e., throw out sticky fingers and timeouts, 
  # and the trials that followed sticky fingers and timeouts 
  # since they were likely affected by unusual previous trials.
  
  # lose impossible trials
  dd <- dat
  dd$RT[dd$RT == 1 ] <- NA
  dd$RT[dd$RT == 59998 ] <- NA
  dd <- dd[complete.cases(dd),]
  row.names(dd) <- NULL
  
  # add preceding RT: because we removed impossible trials, the value for preceding RT for a trial following an impossible trial is the value of the trial that preceded the impossible trial.
  dd$RTprev <- NA
  for (s in levels(dd$Subject)) {
    nrows = nrow(dd[dd$Subject==s,])
    for (i in 1:nrows) {
      if (i==1) {
        dd[dd$Subject==s, "RTprev"][i] <- dd[dd$Subject==s, "RT"][i]
      }
      else
        dd[dd$Subject==s, "RTprev"][i] <- dd[dd$Subject==s, "RT"][i-1]
    }
  }
  
  # add transformations of previous RT
  dd$RTprev_RecSqd    <- 1/dd$RTprev^2
  dd$RTprev_Rec       <- 1/dd$RTprev
  dd$RTprev_RecSqt0   <- 1/sqrt(dd$RTprev) 
  dd$RTprev_RecSqt    <- as.vector(scale( 1/sqrt(dd$RTprev)  ))
  dd$RTprev_log       <- log(dd$RTprev)
  dd$RTprev_sqt       <- sqrt(dd$RTprev)
  dd$RTprev_raw       <- dd$RTprev
  dd$RTprev_sqd       <- dd$RTprev^2 
  
  # number of higher-level cells in the design (abstracting over Order) = Item(4) * Number(2) * Vagueness(2) * Quantity(2) = 32
  # number of measurements in eaach cell = 8
  # 332 * 8 = 256
  # 256 * 30 = 7680
  
  dd$measurement=0
  dd$cell=0
  for (s in levels(dd$Subject)) {
    cellcount=0
    for (i in levels(dd$Item)) {
      for (n in levels(dd$Number)) {
        for (v in levels(dd$Vagueness)) {
          for (q in levels(dd$Quantity)) {
            measurementcount=0
            cellcount=cellcount+1
            for (t in dd[dd$Subject==s & dd$Item==i & dd$Number==n & dd$Vagueness==v & dd$Quantity == q, "Trial"]) {
              dd[dd$Subject==s & dd$Item==i & dd$Number==n & dd$Vagueness==v & dd$Quantity == q & dd$Trial==t, "cell"] <- cellcount
              measurementcount=measurementcount+1
              dd[dd$Subject==s & dd$Item==i & dd$Number==n & dd$Vagueness==v & dd$Quantity == q & dd$Trial==t, "measurement"] = measurementcount
            }
          }
        }
      }
    }    
  }
  
  # put dd in better column order
  dd <- subset(dd, 
               select = c(id, Subject, Trial, Condition, Order, Quantity, Vagueness, Number, 
                          Item, item_range_ratio, item_range_ratio_scaled, item_mean_ratio, item_mean_ratio_scaled,
                          c_Trl, s_Trl, c_Itm, c_Vag, c_Num, f_Cnd, c_Ord, c_Qty,
                          RT, RT_RecSqd, RT_Rec, RT_RecSqt, RT_log, RT_sqt, RT_raw, RT_sqd,
                          RTprev, RTprev_RecSqd, RTprev_Rec, RTprev_RecSqt, RTprev_log, RTprev_sqt, RTprev_raw, RTprev_sqd,
                          Exp_Num, Bline_Num, Extr_Num, Exp_side, Bline_side, Extr_side, response_num, response_side, response_category, Left, Mid, Right,
                          Instruction, nchar_instr,
                          cell, measurement
               ))
  
  # save dd
  save(dd, file="dd.Rda")
}

Compare distributions of the various transformations of RT against random samples from normal distributions with the same mean and sd to see which transformations best approximate normal distributions.

## plot the comparison of the transformations against their normal equivalents to see distributional properties using full data

suppressPackageStartupMessages(library("reshape2"))
suppressPackageStartupMessages(library("ggplot2"))
load("dat.Rda")

# first create a data frame specially for transformations
# this won't be used for analysis, just for this plot
dat_transforms <- dat

# dfSubset says whether any data points were removed
dat_transforms$dfSubset <- "original"

# RTtype says whether the row contains data for an observed RT or 
# a sample from a normal distribution with the same mean and sd
dat_transforms$RTtype <- "observed"

# create extra rows for dfs with removed (outlier) data points
dat1 <- dat_transforms                                       # nrow 7680
dat2 <- subset(dat_transforms, RT>1)                         # nrow 7678
dat2$dfSubset <- "RT>1"
dat3 <- subset(dat_transforms, RT>1 & RT<59999)              # nrow 7677
dat3$dfSubset <- "RT>1&RT<59999"
dat4 <- subset(dat_transforms, RT>1 & RT<59999 & RT<30001)   # nrow 7658
dat4$dfSubset <- "RT>1&RT<30001"

# bring the dfs back into main df
dat_transforms <- rbind(dat1, dat2, dat3, dat4)
dat_transforms$dfSubset <- factor(dat_transforms$dfSubset, levels=c("original", "RT>1", "RT>1&RT<59999", "RT>1&RT<30001"))

# create extra rows for normally distributed transformed variables
dato = dat_transforms
datn = dat_transforms; datn$RTtype <- "normal"
dat_transforms <- rbind(dato,datn)
dat_transforms$RTtype <- factor(dat_transforms$RTtype, levels=c("observed","normal"))

# for safety
dat_transforms[dat_transforms$RTtype=="normal", c("RT_RecSqd", "RT_Rec", "RT_RecSqt", "RT_log", "RT_sqt", "RT_raw", "RT_sqd")] <- NA

# Create the normally distributed transformed variables, replacing the NAs declared above
for (k in levels(dat_transforms$dfSubset)){
  dat_transforms[dat_transforms$dfSubset==k & dat_transforms$RTtype=="normal", "RT_RecSqd"]  <- 
    rnorm(n=length(dat_transforms[dat_transforms$dfSubset==k    & dat_transforms$RTtype=="observed", "RT_RecSqd"]), 
          mean=mean(dat_transforms[dat_transforms$dfSubset==k & dat_transforms$RTtype=="observed", "RT_RecSqd"]), 
          sd=sd(dat_transforms[dat_transforms$dfSubset==k     & dat_transforms$RTtype=="observed", "RT_RecSqd"]))
  dat_transforms[dat_transforms$dfSubset==k & dat_transforms$RTtype=="normal", "RT_Rec"]  <- 
    rnorm(n=length(dat_transforms[dat_transforms$dfSubset==k    & dat_transforms$RTtype=="observed", "RT_Rec"]), 
          mean=mean(dat_transforms[dat_transforms$dfSubset==k & dat_transforms$RTtype=="observed", "RT_Rec"]), 
          sd=sd(dat_transforms[dat_transforms$dfSubset==k     & dat_transforms$RTtype=="observed", "RT_Rec"]))
  dat_transforms[dat_transforms$dfSubset==k & dat_transforms$RTtype=="normal", "RT_RecSqt"]  <- 
    rnorm(n=length(dat_transforms[dat_transforms$dfSubset==k    & dat_transforms$RTtype=="observed", "RT_RecSqt"]), 
          mean=mean(dat_transforms[dat_transforms$dfSubset==k & dat_transforms$RTtype=="observed", "RT_RecSqt"]), 
          sd=sd(dat_transforms[dat_transforms$dfSubset==k     & dat_transforms$RTtype=="observed", "RT_RecSqt"]))
  dat_transforms[dat_transforms$dfSubset==k & dat_transforms$RTtype=="normal", "RT_log"]  <- 
    rnorm(n=length(dat_transforms[dat_transforms$dfSubset==k    & dat_transforms$RTtype=="observed", "RT_log"]), 
          mean=mean(dat_transforms[dat_transforms$dfSubset==k & dat_transforms$RTtype=="observed", "RT_log"]), 
          sd=sd(dat_transforms[dat_transforms$dfSubset==k     & dat_transforms$RTtype=="observed", "RT_log"]))
  dat_transforms[dat_transforms$dfSubset==k & dat_transforms$RTtype=="normal", "RT_sqt"]  <- 
    rnorm(n=length(dat_transforms[dat_transforms$dfSubset==k    & dat_transforms$RTtype=="observed", "RT_sqt"]), 
          mean=mean(dat_transforms[dat_transforms$dfSubset==k & dat_transforms$RTtype=="observed", "RT_sqt"]), 
          sd=sd(dat_transforms[dat_transforms$dfSubset==k     & dat_transforms$RTtype=="observed", "RT_sqt"]))
  dat_transforms[dat_transforms$dfSubset==k & dat_transforms$RTtype=="normal", "RT_raw"]  <- 
    rnorm(n=length(dat_transforms[dat_transforms$dfSubset==k    & dat_transforms$RTtype=="observed", "RT_raw"]), 
          mean=mean(dat_transforms[dat_transforms$dfSubset==k & dat_transforms$RTtype=="observed", "RT_raw"]), 
          sd=sd(dat_transforms[dat_transforms$dfSubset==k     & dat_transforms$RTtype=="observed", "RT_raw"]))
  dat_transforms[dat_transforms$dfSubset==k & dat_transforms$RTtype=="normal", "RT_sqd"]  <- 
    rnorm(n=length(dat_transforms[dat_transforms$dfSubset==k    & dat_transforms$RTtype=="observed", "RT_sqd"]), 
          mean=mean(dat_transforms[dat_transforms$dfSubset==k & dat_transforms$RTtype=="observed", "RT_sqd"]), 
          sd=sd(dat_transforms[dat_transforms$dfSubset==k     & dat_transforms$RTtype=="observed", "RT_sqd"]))
}


temp <- melt(dat_transforms, 
             id.vars=c("dfSubset", "RTtype", "Item", "Vagueness", "Number", "Quantity", "Order"),
             measure.vars=c("RT_RecSqd", "RT_Rec", "RT_RecSqt", "RT_log", "RT_sqt", "RT_raw", "RT_sqd"),
             variable.name="transformation",
             value.name="score")

ggplot(temp, aes(score, colour=RTtype)) + 
  geom_freqpoly() + 
  facet_wrap(dfSubset~transformation, scales="free", nrow=4) +
  theme_bw() + theme(aspect.ratio=1) + scale_color_manual(values=c("blue","red"))

Now working with a data set without impossible trials.

Show how transformations of RT affect distribution of times, and how they affect which times are outliers.

# boxplot subjects and items

load("dd.Rda")
suppressPackageStartupMessages(library(reshape2))
suppressPackageStartupMessages(library(ggplot2))

# allow reference to transformation
subdata = melt(dd,
               measure.vars=c("RT_Rec", "RT_RecSqt", "RT_log", "RT_raw"),
               variable.name="transformation",
               value.name="value") 

# boxplots for subjects and items separately for each dependent variable
ggplot(subdata) + 
  facet_wrap(~ transformation, scales="free", ncol=4) +
  geom_boxplot(aes(y=value, x=Item, col="Items")) + 
  geom_boxplot(aes(y=value, x=Subject, col="Subjects")) + 
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1), aspect.ratio=1, panel.grid=element_blank()) + 
  scale_colour_manual("",values=c("Items"="red","Subjects"="blue")) +
  xlab("") + ylab("")

Show how mean times for individual subjects and items vary with respect to the grand mean RT.

# Show how mean times for individual subjects and items vary with respect to the grand mean RT.

suppressMessages(library(plyr))
suppressMessages(library(lattice))
suppressMessages(library(latticeExtra))

subs=ddply(dd, .(level=Subject), summarise, effect="subject", emean=mean(dd$RT), lmean=mean(RT), ldiff=mean(RT)-mean(dd$RT), lsign=ifelse(mean(RT)-mean(dd$RT)>0,"slow","fast"))
itms=ddply(dd, .(level=Item), summarise, effect="item", emean=mean(dd$RT), lmean=mean(RT), ldiff=mean(RT)-mean(dd$RT), lsign=ifelse(mean(RT)-mean(dd$RT)>0,"slow","fast"))
si=rbind(subs,itms)
si$effect <- factor(si$effect, levels=c("subject", "item"))
si$lsign <- factor(si$lsign, levels=c("slow","fast"))

my1 <- xyplot(data=si, lmean~level | effect, 
              aspect=1,
              layout=c(2,1),
              ylim = extendrange(si$lmean, f = 0.10),
              par.settings=list(trellis.par.set(superpose.symbol = list(col = c("blue","red"), pch=19), superpose.line=list(col = c("blue","red")))),
              groups=si$lsign,
              xlab=NULL,
              ylab="RT (in ms)",
              key=list(text=list(levels(si$lsign)), lines=list(lty=c(1,1), col=trellis.par.get('superpose.line')$col, pch=c(19,19), type='o'), divide=1),
              mean=unique(si$emean),
              scales=list(x=list(relation="free", at=list(15.5, 32.5), labels=list("subject number", "item identifier"), cex=1 ), 
                          y=list(at=c(0, 2000, unique(si$emean), 4000, 6000, 8000,  10000), labels=c(0, 2000,  "Mean", 4000,  6000, 8000, 10000), cex=.9) ),
              panel=function(x,y,type,subscripts,groups=groups,...){
                panel.xyplot(x,y,type='p',groups=groups, subscripts=subscripts,...)
                panel.arrows(x0=x, y0=unique(si$emean), x1=x, y1=y, groups=groups, subscripts=subscripts, code=3, length=0, col=trellis.par.get('superpose.line')$col[groups][subscripts], ...)
                panel.abline(h=unique(si$emean), col="lightgrey",lty=1)
                panel.text(x, y=(ifelse(y<unique(si$emean),y-300,y+300)), as.numeric(si$level)[subscripts][panel.number()==1], cex=.75)
                panel.text(x, y=(ifelse(y<unique(si$emean),y-300,y+300)), as.character(si$level)[subscripts][panel.number()==2], cex=.75)
              })

my2 <- xyplot(data=si, ldiff~level | effect, 
              aspect=1, type='n',
              ylab="Deviation from mean RT (in ms)",
              ylim=extendrange(si$ldiff,f=0.10))

print(doubleYScale(my1, my2, add.axis=TRUE, add.ylab2=TRUE, use.style=FALSE))

Show how mean times for individual subjects and items vary with respect to the grand mean RT_RecSqt.

# Show how mean times for individual subjects and items vary with respect to the grand mean RT_RecSqt.

suppressMessages(library(plyr))
suppressMessages(library(lattice))
suppressMessages(library(latticeExtra))

subs <- ddply(dd, .(level=Subject), summarise, 
              effect="subject", emean=mean(dd$RT_RecSqt), lmean=mean(RT_RecSqt), ldiff=mean(RT_RecSqt)-mean(dd$RT_RecSqt),
              lsign=ifelse(mean(RT_RecSqt)-mean(dd$RT_RecSqt)<0,"slow","fast"))
itms <- ddply(dd, .(level=Item), summarise, effect="item", emean=mean(dd$RT_RecSqt), lmean=mean(RT_RecSqt),
              ldiff=mean(RT_RecSqt)-mean(dd$RT_RecSqt), lsign=ifelse(mean(RT_RecSqt)-mean(dd$RT_RecSqt)<0,"slow","fast"))
si <- rbind(subs,itms)
si$effect <- factor(si$effect, levels=c("subject", "item"))
si$lsign <- factor(si$lsign, levels=c("slow","fast"))

my1 <- xyplot(data=si, lmean~level | effect, 
              aspect=1,
              layout=c(2,1),
              ylim = extendrange(si$lmean, f = 0.10),
              par.settings = list(trellis.par.set(superpose.symbol = list(col = c("blue","red"), pch=19), superpose.line=list(col = c("blue","red")))),
              groups = si$lsign,
              xlab = NULL,
              ylab = "RT_RecSqt: reciprocal of square root of RT in ms\n",
              key=list(text=list(levels(si$lsign)), lines=list(lty=c(1,1), col=trellis.par.get('superpose.line')$col, pch=c(19,19), type='o'), divide=1),
              mean = unique(si$emean),
              scales = list(x=list(relation="free", at=list(15.5, 32.5), labels=list("subject number", "item identifier"), cex=1 ) ),
              panel=function(x,y,type,subscripts,groups=groups,...){
                panel.xyplot(x,y,type='p',groups=groups, subscripts=subscripts,...)
                panel.arrows(x0=x, y0=unique(si$emean), x1=x, y1=y, groups=groups, subscripts=subscripts, code=3, length=0, col=trellis.par.get('superpose.line')$col[groups][subscripts], ...)
                panel.abline(h=unique(si$emean), col="lightgrey",lty=1)
                panel.text(x, y=(ifelse(y<unique(si$emean),y-.001,y+.001)), as.numeric(si$level)[subscripts][panel.number()==1], cex=.75)
                panel.text(x, y=(ifelse(y<unique(si$emean),y-.001,y+.001)), as.character(si$level)[subscripts][panel.number()==2], cex=.75)
              })

my2 <- xyplot(data=si, ldiff~level | effect, 
              aspect=1, type='n',
              ylab="\nDeviation from mean RT_RecSqt (in ms)",
              ylim=extendrange(si$ldiff,f=0.10))

print(doubleYScale(my1, my2, add.axis=TRUE, add.ylab2=TRUE, use.style=FALSE))

Show how practice and fatigue effects vary over subjects

if(require(ggplot2)){}
load("dd.Rda")
# Show how practice/fatigue effects vary over subjects
ggplot(dd, aes(y=RT_RecSqt, x=Trial)) + facet_wrap(~Subject,nrow=3) + theme(panel.grid=element_blank(), panel.margin = unit(0, "lines"), panel.background= element_rect(fill = 'white', colour='black' ), legend.position="top") + scale_x_continuous("Trial", breaks=NULL) + geom_point(pch=19, cex=.25, alpha=.5) + stat_smooth(fill='lightblue',col="red",lwd=.5, show.legend=TRUE) + theme(aspect.ratio = 1) 

Show practice and fatigue effects vary over items.

# Show how practice/fatigue effects vary over items
ggplot(dd, aes(y=RT_RecSqt, x=Trial)) + facet_grid(~Item) + theme(panel.grid=element_blank(), panel.margin = unit(0, "lines"), panel.background= element_rect(fill = 'white', colour='black' ), legend.position="top") + scale_x_continuous("Trial") + geom_point(pch=19, cex=.25, alpha=.5) + stat_smooth(fill='lightblue',col="red",lwd=.5, show.legend=TRUE) + theme(aspect.ratio = 1) 

Plot main effects and interactions

Main effects

Plot main effects on several transformations

# plot main effects on observed data

# allow reference to transformation
temp <- melt(dd,
             id.vars=c("Item", "Vagueness", "Number", "Quantity", "Order"),
             measure.vars=c("RT_Rec", "RT_RecSqt", "RT_log",  "RT_raw"),
             variable.name="transformation",
             value.name="score")

# allow reference to effect
temp2 <- melt(temp, 
              id.vars=c("transformation", "score"),
              measure.vars=c("Item", "Vagueness", "Number", "Quantity", "Order"),
              variable.name="effect",
              value.name="level")


# do the plot
ggplot(temp2, aes(y=score, x=level, group=1, col=effect)) +
  facet_wrap(~transformation + effect, scales="free", shrink=TRUE, ncol=5) +
  stat_summary(fun.data=mean_cl_normal, geom="pointrange") +
  stat_summary(fun.y=mean, geom="line") +
  theme(aspect.ratio=1, axis.text.y=element_blank())

Main effects

# prep main effects rts data frame
vrts <- cbind(Effect="Vagueness",x=factor(c(1,2)),summarySEwithin(dd, measurevar="RT_RecSqt", withinvars=c("Vagueness")));names(vrts)[3]<-"Levels"
nrts <- cbind(Effect="Number",x=factor(c(1,2)), summarySEwithin(dd, measurevar="RT_RecSqt", withinvars=c("Number")));names(nrts)[3]<-"Levels"
qrts <- cbind(Effect="Quantity",x=factor(c(1,2)), summarySEwithin(dd, measurevar="RT_RecSqt", withinvars=c("Quantity")));names(qrts)[3]<-"Levels"
orts <- cbind(Effect="Order",x=factor(c(1,2)), summarySEwithin(dd, measurevar="RT_RecSqt", withinvars=c("Order")));names(orts)[3]<-"Levels"
irts <- cbind(Effect="Item",x=factor(c(1,2,3,4)), summarySEwithin(dd, measurevar="RT_RecSqt", withinvars=c("Item")));names(irts)[3]<-"Levels"
rts=rbind(vrts,nrts,qrts,orts)
rts$mins=rts$RT_RecSqt-rts$ci
rts$maxs=rts$RT_RecSqt+rts$ci
# plot main effects
ggplot(rts, aes(y=RT_RecSqt, x=Levels, group=1, ymin=mins, ymax=maxs)) + 
  geom_line() +
  geom_errorbar(width=.1) +
  geom_point(cex=3) + 
  facet_grid(~Effect, scale="free_x") + 
  theme_bw() +
  theme(aspect.ratio=1) 

Main effects over items

vrts <- cbind(Effect="Vagueness",x=factor(c(1,2)),summarySEwithin(dd, measurevar="RT_RecSqt", withinvars=c("item_mean_ratio","Vagueness")));names(vrts)[4]<-"Levels"
nrts <- cbind(Effect="Number",x=factor(c(1,2)), summarySEwithin(dd, measurevar="RT_RecSqt", withinvars=c("item_mean_ratio","Number")));names(nrts)[4]<-"Levels"
qrts <- cbind(Effect="Quantity",x=factor(c(1,2)), summarySEwithin(dd, measurevar="RT_RecSqt", withinvars=c("item_mean_ratio","Quantity")));names(qrts)[4]<-"Levels"
orts <- cbind(Effect="Order",x=factor(c(1,2)), summarySEwithin(dd, measurevar="RT_RecSqt", withinvars=c("item_mean_ratio","Order")));names(orts)[4]<-"Levels"

rts=rbind(vrts,nrts,qrts,orts)
rts$mins=rts$RT_RecSqt-rts$ci
rts$maxs=rts$RT_RecSqt+rts$ci

rts$item_mean_ratio <- as.numeric(as.character(rts$item_mean_ratio))

ggplot(rts, aes(y=RT_RecSqt, x=item_mean_ratio, group=Levels, ymin=mins, ymax=maxs, shape=Levels, fill=Levels, col=Levels)) +
  scale_shape_manual(values=c(21,21,22,22,23,23,24,24)) +
  scale_fill_manual(values=c("black","gray","black","gray","black","gray","black","gray")) +
  scale_color_manual(values=c("black","gray","black","gray","black","gray","black","gray")) +
  geom_line() +
  geom_errorbar(width=.1) +
  geom_point(cex=3) + 
  facet_grid(~Effect) + 
  theme_bw() +
  theme(aspect.ratio=1, 
        panel.grid=element_blank()) +
  scale_x_continuous(breaks = round(sort(unique(rts$item_mean_ratio)),1)) 

2 way interactions not involving items.

# prep 2 ways rts data frame
rts1 <- summarySEwithin(dd, measurevar="RT_RecSqt", withinvars=c("Vagueness","Number")); names(rts1)[1] <- "F1"; names(rts1)[2] <- "F2"
rts1$Effect <- rep("Vagueness:Number",4)
rts2 <- summarySEwithin(dd, measurevar="RT_RecSqt", withinvars=c("Vagueness","Quantity")); names(rts2)[1] <- "F1"; names(rts2)[2] <- "F2"
rts2$Effect <- rep("Vagueness:Quantity",4)
rts3 <- summarySEwithin(dd, measurevar="RT_RecSqt", withinvars=c("Vagueness","Order")); names(rts3)[1] <- "F1"; names(rts3)[2] <- "F2"
rts3$Effect <- rep("Vagueness:Order",4)
rts5 <- summarySEwithin(dd, measurevar="RT_RecSqt", withinvars=c("Number","Quantity")); names(rts5)[1] <- "F1"; names(rts5)[2] <- "F2"
rts5$Effect <- rep("Number:Quantity",4)
rts6 <- summarySEwithin(dd, measurevar="RT_RecSqt", withinvars=c("Number","Order")); names(rts6)[1] <- "F1"; names(rts6)[2] <- "F2"
rts6$Effect <- rep("Number:Order",4)
rts8 <- summarySEwithin(dd, measurevar="RT_RecSqt", withinvars=c("Quantity","Order")); names(rts8)[1] <- "F1"; names(rts8)[2] <- "F2"
rts8$Effect <- rep("Quantity:Order",4)
rts <- rbind(rts1,rts2,rts3,rts5,rts6,rts8)
rts <- subset(rts, select=c(Effect, F1, F2, RT_RecSqt, RT_RecSqt_norm, sd, se, ci))
rts$mins=rts$RT_RecSqt-rts$ci
rts$maxs=rts$RT_RecSqt+rts$ci
# plot 2 way interactions
ggplot(rts, aes(y=RT_RecSqt, x=F2, group=F1, ymin=mins, ymax=maxs, pch=F1)) + 
  geom_line() +
  geom_errorbar(width=.1) +
  geom_point(cex=3) + 
  facet_wrap(~Effect, scale="free_x", nrow=1) + 
  theme_bw() +
  theme(aspect.ratio=1) +
  theme(legend.title=element_blank()) +
  xlab("Levels")

Plot 2 way interactions over items

# prep 2 ways over items rts data frame
rts1 <- summarySEwithin(dd, measurevar="RT_RecSqt", withinvars=c("item_mean_ratio", "Vagueness","Number")); names(rts1)[2] <- "F1"; names(rts1)[3] <- "F2"
rts1$E1 <- "Vagueness"
rts1$E2 <- "Number"
rts2 <- summarySEwithin(dd, measurevar="RT_RecSqt", withinvars=c("item_mean_ratio", "Vagueness","Quantity")); names(rts2)[2] <- "F1"; names(rts2)[3] <- "F2"
rts2$E1 <- "Vagueness"
rts2$E2 <- "Quantity"
rts3 <- summarySEwithin(dd, measurevar="RT_RecSqt", withinvars=c("item_mean_ratio", "Vagueness","Order")); names(rts3)[2] <- "F1"; names(rts3)[3] <- "F2"
rts3$E1 <- "Vagueness"
rts3$E2 <- "Order"
rts4 <- summarySEwithin(dd, measurevar="RT_RecSqt", withinvars=c("item_mean_ratio", "Quantity", "Number")); names(rts4)[2] <- "F1"; names(rts4)[3] <- "F2"
rts4$E1 <- "Quantity"
rts4$E2 <- "Number"
rts5 <- summarySEwithin(dd, measurevar="RT_RecSqt", withinvars=c("item_mean_ratio", "Order","Number")); names(rts5)[2] <- "F1"; names(rts5)[3] <- "F2"
rts5$E1 <- "Order"
rts5$E2 <- "Number"
rts6 <- summarySEwithin(dd, measurevar="RT_RecSqt", withinvars=c("item_mean_ratio", "Quantity","Order")); names(rts6)[2] <- "F1"; names(rts6)[3] <- "F2"
rts6$E1 <- "Quantity"
rts6$E2 <- "Order"



rts <- rbind(rts1,rts2,rts3,rts4,rts5,rts6)
rts$mins=rts$RT_RecSqt-rts$ci
rts$maxs=rts$RT_RecSqt+rts$ci

rts$item_mean_ratio <- as.numeric(as.character(rts$item_mean_ratio))
rts$F3 <- factor(paste(rts$F1,rts$F2))

rts$E1 <- factor(rts$E1, levels=c("Number","Order","Quantity","Vagueness"))
rts$E2 <- factor(rts$E2, levels=c("Number","Order","Quantity","Vagueness"))

# The palette with black:
cbbPalette <- c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")


ggplot(rts, aes(y=RT_RecSqt, x=item_mean_ratio, group=F3, col=F2, pch=F1)) + 
  facet_grid(E2~E1, drop=F) + 
  geom_point(cex=3) + geom_line() + theme_bw() + 
  theme(aspect.ratio=1, panel.grid=element_blank()) +
  scale_color_manual(name="colour",values=cbbPalette) +
  scale_shape_discrete(name="shape") +
  scale_x_continuous(breaks = round(sort(unique(rts$item_mean_ratio)),1))

Plot vagueness by number interaction over items

rts <- summarySEwithin(dd, measurevar="RT_RecSqt", withinvars=c("Number","Vagueness","Item"))
rts$condition=paste(rts$Number,rts$Vagueness, sep= ' ')
dodge = position_dodge(width=0.2)
ggplot(rts, aes(y=RT_RecSqt, x=Item, ymin=RT_RecSqt-ci, ymax=RT_RecSqt+ci, 
                group=condition, shape=condition, fill=condition)) +
  geom_line(position=dodge) + 
  geom_errorbar(width=.2, position=dodge) + 
  geom_point(size=4, position=dodge) + 
  scale_shape_manual("",values = c(22, 22, 21, 21)) + 
  scale_fill_manual("",values=c("black","white","black","white")) +
  ggtitle("Response time") + 
  ylab("RT_RecSqt") + 
  xlab("") +
  theme_bw() +
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), legend.key = element_blank(), aspect.ratio=1, axis.text.x = element_text(angle = 45, hjust = 1)) +
  facet_grid(~Number)

3 way interactions

rts <- summarySEwithin(dd, measurevar="RT_RecSqt", withinvars=c("Number","Vagueness","Quantity"))
rts$mins=rts$RT_RecSqt-rts$ci
rts$maxs=rts$RT_RecSqt+rts$ci
p1=ggplot(rts, aes(y=RT_RecSqt, x=Number, group=Vagueness, ymin=mins, ymax=maxs, pch=Vagueness)) + 
  geom_line() +
  geom_errorbar(width=.1) +
  geom_point(cex=3) + 
  facet_grid(~Quantity, scale="free_x") + 
  theme_bw() +
  theme(aspect.ratio=1) +
  theme(legend.title=element_blank(), legend.position="top")
rts <- summarySEwithin(dd, measurevar="RT_RecSqt", withinvars=c("Number","Vagueness","Order"))
rts$mins=rts$RT_RecSqt-rts$ci
rts$maxs=rts$RT_RecSqt+rts$ci
p2=ggplot(rts, aes(y=RT_RecSqt, x=Number, group=Vagueness, ymin=mins, ymax=maxs, pch=Vagueness)) + 
  geom_line() +
  geom_errorbar(width=.1) +
  geom_point(cex=3) + 
  facet_grid(~Order, scale="free_x") + 
  theme_bw() +
  theme(aspect.ratio=1) +
  theme(legend.title=element_blank(), legend.position="top")
rts <- summarySEwithin(dd, measurevar="RT_RecSqt", withinvars=c("Number","Quantity","Order"))
rts$mins=rts$RT_RecSqt-rts$ci
rts$maxs=rts$RT_RecSqt+rts$ci
p3=ggplot(rts, aes(y=RT_RecSqt, x=Number, group=Quantity, ymin=mins, ymax=maxs, pch=Quantity)) + 
  geom_line() +
  geom_errorbar(width=.1) +
  geom_point(cex=3) + 
  facet_grid(~Order, scale="free_x") + 
  theme_bw() +
  theme(aspect.ratio=1) +
  theme(legend.title=element_blank(), legend.position="top")
rts <- summarySEwithin(dd, measurevar="RT_RecSqt", withinvars=c("Vagueness","Quantity","Order"))
rts$mins=rts$RT_RecSqt-rts$ci
rts$maxs=rts$RT_RecSqt+rts$ci
p4=ggplot(rts, aes(y=RT_RecSqt, x=Vagueness, group=Quantity, ymin=mins, ymax=maxs, pch=Quantity)) + 
  geom_line() +
  geom_errorbar(width=.1) +
  geom_point(cex=3) + 
  facet_grid(~Order, scale="free_x") + 
  theme_bw() +
  theme(aspect.ratio=1) +
  theme(legend.title=element_blank(), legend.position="top")
grid.arrange(nrow=2, p1,p2,p3,p4)

Plot vagueness by number by quantity over items.

rts <- summarySEwithin(dd, measurevar="RT_RecSqt", withinvars=c("Number","Vagueness","Item", "Quantity"))
rts$condition=paste(rts$Number,rts$Vagueness, sep= ' ')
dodge = position_dodge(width=0.2)
ggplot(rts, aes(y=RT_RecSqt, x=Item, ymin=RT_RecSqt-ci, ymax=RT_RecSqt+ci, 
                group=Vagueness, 
                fill=Vagueness)) +
  geom_line(position=dodge) + 
  geom_errorbar(width=.2, position=dodge) + 
  geom_point(pch=21, size=4, position=dodge) + 
  scale_fill_manual("",values=c("black","white")) +
  ggtitle("Response time") + 
  ylab("RT_Rec_Sqt") + 
  xlab("") +
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), 
        panel.background=element_rect(fill="white",color=1),
        legend.key = element_blank(), 
        aspect.ratio=1, axis.text.x = element_text(angle = 45, hjust = 1)) +
  facet_grid(~Number+Quantity)

4 way interaction

rts <- summarySEwithin(dd, measurevar="RT_RecSqt", withinvars=c("Number", "Vagueness",  "Quantity", "Order"))
rts$condition=paste(rts$Number,rts$Vagueness, sep= ' ')
ggplot(rts, aes(y=RT_RecSqt, x=Vagueness, ymin=RT_RecSqt-ci, ymax=RT_RecSqt+ci, group=Quantity, pch=Quantity)) +
  facet_grid(~Order+Number, scale="free_x") +
  geom_line() +
  geom_errorbar(width=.1) +
  geom_point(cex=3) + 
  theme_bw() +
  theme(aspect.ratio=1) +
  theme(legend.title=element_blank())

4 way interaction split over items

rts <- summarySEwithin(dd, measurevar="RT_RecSqt", withinvars=c("Number", "Vagueness", "Item", "Quantity", "Order"))
rts$condition=paste(rts$Number,rts$Vagueness, sep= ' ')
dodge = position_dodge(width=0.2)
ggplot(rts, aes(y=RT_RecSqt, x=Item, ymin=RT_RecSqt-ci, ymax=RT_RecSqt+ci, 
                group=Vagueness,fill=Vagueness)) +
  geom_line(position=dodge) + 
  geom_errorbar(width=.2, position=dodge) + 
  geom_point(pch=21, size=4, position=dodge) + 
  scale_fill_manual("",values=c("black","white","black","white")) +
  ggtitle("Number * Vagueness * Quantity * Order") + 
  ylab("RT_RecSqt") + 
  xlab("") +
  theme_bw() +
  theme( legend.key = element_blank(), axis.text.x = element_text(angle = 45, hjust = 1), axis.text.x = element_text(angle = 45, hjust = 1), aspect.ratio=1) +
  facet_grid(Vagueness+Order~Number+Quantity)

4 way interaction split over items showing ratio of item dots.

rts <- summarySEwithin(dd, measurevar="RT_RecSqt", withinvars=c("Number", "Vagueness", "item_mean_ratio", "Quantity", "Order"))
rts$condition=paste(rts$Number,rts$Vagueness, sep= ' ')
rts$item_mean_ratio <- as.numeric(as.character(rts$item_mean_ratio))
dodge = position_dodge(width=0.2)
ggplot(rts, aes(y=RT_RecSqt, x=item_mean_ratio, ymin=RT_RecSqt-ci, ymax=RT_RecSqt+ci, 
                group=Vagueness,fill=Vagueness)) +
  geom_line(position=dodge) + 
  geom_errorbar(width=.2, position=dodge) + 
  geom_point(pch=21, size=4, position=dodge) + 
  scale_fill_manual("",values=c("black","white","black","white")) +
  ggtitle("Number * Vagueness * Quantity * Order") + 
  ylab("RT_RecSqt") + 
  xlab("") +
  theme_bw() +
  theme( legend.key = element_blank(), axis.text.x = element_text(angle = 45, hjust = 1), axis.text.x = element_text(angle = 45, hjust = 1), aspect.ratio=1) +
  facet_grid(Vagueness+Order~Number+Quantity)

Show compression of item ratios

Model ols()

ols model v4, RecSqt, in progress

load("dd.Rda")
suppressPackageStartupMessages(require(rms))
suppressPackageStartupMessages(require(languageR))
options(width=140)

Pairs plot

# pairs plot
pairs(dd[,c("RT_RecSqt", "Number", "Vagueness", "RTprev_RecSqt")], pch = 19, cex=.075)

Build the model

## do ols model
# define datadist
dd.dd = datadist(dd)
options(datadist = "dd.dd")
# build model
v4 <- ols(data=dd, 
          RT_RecSqt ~ 
            c_Vag + c_Num + c_Qty + c_Ord + 
            item_mean_ratio + 
            #c_Vag:c_Num + 
            #c_Vag:c_Num:item_mean_ratio + 
            # pol(s_Trl, 3) +
            s_Trl+
            # pol(RTprev_RecSqt,3) +
            RTprev_RecSqt +
            nchar_instr+
            # cell+
            measurement,
          x=T, y=T )
# print out p value for likelihood ratio with df
cat("lik.ratio p value, whether the model as a whole is explanatory: p =", 1-pchisq(v4$stats[2],v4$stats[3]))
## lik.ratio p value, whether the model as a whole is explanatory: p = 0
# print out R2 for the model
cat("R^2 = ", v4$stats[4])
## R^2 =  0.2898379

Show the model tables

# show model
v4
## 
## Linear Regression Model
## 
## ols(formula = RT_RecSqt ~ c_Vag + c_Num + c_Qty + c_Ord + item_mean_ratio + 
##     s_Trl + RTprev_RecSqt + nchar_instr + measurement, data = dd, 
##     x = T, y = T)
##                 Model Likelihood     Discrimination    
##                    Ratio Test           Indexes        
## Obs     7677    LR chi2   2627.55    R2       0.290    
## sigma 0.0062    d.f.            9    R2 adj   0.289    
## d.f.    7667    Pr(> chi2) 0.0000    g        0.004    
## 
## Residuals
## 
##        Min         1Q     Median         3Q        Max 
## -2.099e-02 -4.118e-03  6.474e-05  4.233e-03  1.950e-02 
## 
##                 Coef    S.E.   t      Pr(>|t|)
## Intercept        0.0302 0.0011  26.78 <0.0001 
## c_Vag           -0.0007 0.0001  -4.66 <0.0001 
## c_Num            0.0043 0.0001  29.71 <0.0001 
## c_Qty            0.0009 0.0001   6.27 <0.0001 
## c_Ord           -0.0004 0.0001  -2.59 0.0095  
## item_mean_ratio -0.0077 0.0006 -12.59 <0.0001 
## s_Trl            0.0001 0.0003   0.29 0.7710  
## RTprev_RecSqt    0.0030 0.0001  42.03 <0.0001 
## nchar_instr     -0.0001 0.0000  -3.07 0.0021  
## measurement      0.0003 0.0001   1.86 0.0632
# show model summary
summary(v4)
##              Effects              Response : RT_RecSqt 
## 
##  Factor          Low      High     Diff.    Effect      S.E.       Lower 0.95  Upper 0.95 
##  c_Vag           -0.50000  0.50000 1.000000 -0.00068336 1.4656e-04 -9.7066e-04 -3.9605e-04
##  c_Num           -0.50000  0.50000 1.000000  0.00429050 1.4443e-04  4.0074e-03  4.5737e-03
##  c_Qty           -0.50000  0.50000 1.000000  0.00088397 1.4103e-04  6.0752e-04  1.1604e-03
##  c_Ord           -0.50000  0.50000 1.000000 -0.00036562 1.4098e-04 -6.4197e-04 -8.9263e-05
##  item_mean_ratio  0.68765  0.76916 0.081509 -0.00062600 4.9733e-05 -7.2349e-04 -5.2851e-04
##  s_Trl           -0.85921  0.87274 1.732000  0.00017291 5.9392e-04 -9.9134e-04  1.3372e-03
##  RTprev_RecSqt   -0.61685  0.63201 1.248900  0.00375500 8.9332e-05  3.5799e-03  3.9301e-03
##  nchar_instr     30.00000 36.00000 6.000000 -0.00043872 1.4274e-04 -7.1854e-04 -1.5891e-04
##  measurement      2.00000  6.00000 4.000000  0.00111280 5.9890e-04 -6.1196e-05  2.2868e-03
# do anova 
an1 = anova(v4)
# show anova
an1
##                 Analysis of Variance          Response: RT_RecSqt 
## 
##  Factor          d.f. Partial SS   MS           F       P     
##  c_Vag              1 8.288338e-04 8.288338e-04   21.74 <.0001
##  c_Num              1 3.364681e-02 3.364681e-02  882.51 <.0001
##  c_Qty              1 1.497985e-03 1.497985e-03   39.29 <.0001
##  c_Ord              1 2.564381e-04 2.564381e-04    6.73 0.0095
##  item_mean_ratio    1 6.040552e-03 6.040552e-03  158.44 <.0001
##  s_Trl              1 3.231490e-06 3.231490e-06    0.08 0.7710
##  RTprev_RecSqt      1 6.736390e-02 6.736390e-02 1766.86 <.0001
##  nchar_instr        1 3.601577e-04 3.601577e-04    9.45 0.0021
##  measurement        1 1.316315e-04 1.316315e-04    3.45 0.0632
##  REGRESSION         9 1.193022e-01 1.325580e-02  347.68 <.0001
##  ERROR           7667 2.923148e-01 3.812636e-05

plot ‘partial effects’

## plot partial effects
# Compute Predicted Values and Confidence Limits
p1=Predict(v4)
# plot 'partial effects'
plot(p1, anova=an1, pval=T, aspect=1, main="ols model v4")

# add predicted RT_RecSqt to dd
dd$RT_Predicted <- predict (v4)

Plot model coefficients and ci’s

par(las=2, mar=c(14,6,1,1))
y=coef(v4) # with intercept
n=length(y)
y0=confint(v4)[1:n,1]
y1=confint(v4)[1:n,2]

y=coef(v4)[-1] # omit intercept
n=length(y)
y0=confint(v4)[-1,1]
y1=confint(v4)[-1,2]

plot(y, xaxt="n", xlab="", ylab="RT_RecSqt\n", pch=19, ylim=extendrange(y, f=.10), main="Speed")
abline(h=0)
axis(1, labels=names(y) , at=1:n)
grid()
segments(x0=1:n, x1=1:n, y0, y1, lwd=2)

Collinearity is assessed by the condition number \(k\).
The greater the collinearity, the closer the matrix of predictors is is to becoming singular.
\(k\) estimates the extent to which a matrix is \(\text{singular}\)
\(0\ldots6\) no collinearity
\(\text{around} 15\) medium collinearity
\(>30\) indicates potentially harmful collinearity

cat("k is:",
collin.fnc(dd[,c("s_Trl", "c_Num", "c_Vag", "c_Qty", "c_Ord", "item_mean_ratio", "RTprev_RecSqt", "nchar_instr", "measurement")])$cnumber
)
## k is: 38.03642
par(mar=c(1.1,3.1,1.1,1.1), pty='s')
plot(varclus(as.matrix(dd[,c("s_Trl", "c_Num", "c_Vag", "c_Qty", "c_Ord", "item_mean_ratio", "RTprev_RecSqt", "nchar_instr", "measurement")])))

Model criticism baayen plots

# Baayen 4-plot model criticism

par(mfrow=c(1,4), pty='s')
# create scaled residuals
v4$rstand = as.vector(scale(resid(v4)))
# plot scaled residuals density
plot(density(v4$rstand))
# plot sample quantiles versus theoretical quantiles
qqnorm(v4$rstand, cex=.5)
qqline(v4$rstand)
# plot standardised residuals versus fitted values
plot(v4$rstand ~ fitted(v4), pch='.')
# absolute standardised residuals greater than 2.5 are candidates for being outliers, the abline identifies them on the plot
abline(h=c(-2.5,2.5))
# create dffits
dffits=abs(resid(v4, "dffits"))
# plot dffits
plot(dffits, type='h')

# here no overly influential
w = which.influence(fit = v4, cutoff = 0.4)
w
## list()

Model validation

Validation tests overfitting using bootstrap.

Bootstrapping draws as a bootstrap sample the same number of samples as the original data, at random with replacement, from the original data.

Then fit the model to the data in the bootstrap sample, and use this model to predict the reaction times for the original full data (which contains many samples that are new to the bootstrap model).

Then compare the goodness of fit of the bootstrap model with the goodness of fit of the original model.

Averaged over a large number of bootstrap models these comparisons of goodness of fit reveal to what extent the original model overfits the data.

Function validate() in rms does this re-sampling validation.

# argument B is the number of bootstrap runs
# argument pr is whether to print the results of each run
v_1 <- validate(v4, bw = T, B = 200, pr=FALSE, estimates=TRUE)
## 
##      Backwards Step-down - Original Model
## 
##  Deleted Chi-Sq d.f. P     Residual d.f. P     AIC   R2  
##  s_Trl   0.08   1    0.771 0.08     1    0.771 -1.92 0.29
## 
## Approximate Estimates after Deleting Factors
## 
##                       Coef      S.E.  Wald Z         P
## Intercept        3.004e-02 8.914e-04  33.701 0.000e+00
## c_Vag           -6.837e-04 1.466e-04  -4.665 3.082e-06
## c_Num            4.290e-03 1.444e-04  29.706 0.000e+00
## c_Qty            8.844e-04 1.410e-04   6.272 3.573e-10
## c_Ord           -3.662e-04 1.410e-04  -2.598 9.372e-03
## item_mean_ratio -7.667e-03 6.086e-04 -12.598 0.000e+00
## RTprev_RecSqt    3.007e-03 7.153e-05  42.036 0.000e+00
## nchar_instr     -7.308e-05 2.379e-05  -3.072 2.127e-03
## measurement      3.208e-04 3.121e-05  10.280 0.000e+00
## 
## Factors in Final Model
## 
## [1] c_Vag           c_Num           c_Qty           c_Ord           item_mean_ratio RTprev_RecSqt   nchar_instr     measurement
# in print.validate, B is the number of re-samples to show outocmes for each resample
print(v_1, B=0)
##           index.orig training   test optimism index.corrected   n
## R-square      0.2898   0.2901 0.2887   0.0014          0.2885 200
## MSE           0.0000   0.0000 0.0000   0.0000          0.0000 200
## g             0.0045   0.0045 0.0045   0.0000          0.0045 200
## Intercept     0.0000   0.0000 0.0000   0.0000          0.0000 200
## Slope         1.0000   1.0000 0.9984   0.0016          0.9984 200
# show how often each number of variables kept was observed across the 200 runs
xtabs(~rowSums(attr(v_1,"kept")))
## rowSums(attr(v_1, "kept"))
##   5   6   7   8   9 
##   1   6  63 124   6